rm(list = ls())
pacman::p_load(xtable, readstata13, foreign, dplyr, data.table, lfe, ggplot2, rgdal, readxl, forcats, viridis,
               stringr, wesanderson, rgeos, smoothr, readr, fwildclusterboot, remotes, vdemdata, haven, sf)

dir = dirname(rstudioapi::getSourceEditorContext()$path) # set directory
setwd(dir)

################################################################################
# Figures using Cross-national data (Figures 1, A1, A2, A3, A4) ################
################################################################################

# Cross-national data on registration inequality (Figure 1, Figure A2)
################################################################################

dat <- fread("Data/Cross-national/registration.csv") 

dat$log_gdp_cap <- log(dat$gdp_cap)

dat$label <- 'RoW'
dat$label[dat$region=='SSF'] <- 'SSA'
dat$label[dat$country=='Tanzania'] <- 'Tanzania'
dat$label <- factor(dat$label, levels=unique(dat$label))

make.plot <- function(x_var, y_var, x_lab, y_lab, xlim, ylim, leg_place){
  df <- dat %>% select(xv=matches(x_var), yv=matches(y_var), label)
  p <- ggplot(data=df, aes(x=xv, y=yv))+theme_bw()+
    geom_point(aes(group=label, color=label, shape=label), alpha=0.85)+xlab(x_lab)+ylab(y_lab)+
    geom_smooth(se=F,span=2,colour="grey30",fullrange=T)+
    geom_hline(aes(yintercept=0), lty=2, colour="grey20")+
    scale_x_continuous(limits=xlim)+scale_y_continuous(limits=ylim)+
    scale_colour_manual(values=c("grey70","#29AF7FFF","#481567FF"))+
    scale_shape_manual(values=c(19,19,17))+
    theme(legend.position=leg_place,legend.key=element_rect(colour='black',fill=NULL,size = 0.5),legend.title=element_blank())
  return(p)
}

# Figure 1 (GDP per capita)
ggsave("Figures/Figure 1a.pdf", make.plot(x_var="log_gdp_cap",y_var="pct_birthreg",x_lab="Log GDP per capita",y_lab="Registration (%)",xlim=c(5.75,10),ylim=c(0,100),leg_place=c(0.8,0.2)), width=4, height=4)
ggsave("Figures/Figure 1b.pdf", make.plot(x_var="pct_birthreg",y_var="r_p_diff",x_lab="Registration (%)",y_lab="Rich-poor registration difference",xlim=c(0,100),ylim=c(-5,75),leg_place=c(0.82,0.85)), width=4, height=4)

# Figure A2 ()
ggsave("Figures/Figure A2a.pdf", make.plot(x_var="log_gdp_cap",y_var="pct_birthreg",x_lab="Log GDP per capita",y_lab="Registration (%)",xlim=c(5.75,10),ylim=c(0,100),leg_place=c(0.8,0.2)), width=4, height=4)
ggsave("Figures/Figure A2b.pdf", make.plot(x_var="log_gdp_cap",y_var="r_p_diff",x_lab="Log GDP per capita",y_lab="Rich-poor registration difference",xlim=c(5.75,10),ylim=c(-5,75),leg_place=c(0.82,0.85)), width=4, height=4)

ggsave("Figures/Figure A2c.pdf", make.plot(x_var="sc_2015",y_var="pct_birthreg",x_lab="State capacity",y_lab="Registration (%)",xlim=c(-2.5,2.5),ylim=c(0,100),leg_place=c(0.8,0.2)), width=4, height=4)
ggsave("Figures/Figure A2d.pdf", make.plot(x_var="sc_2015",y_var="r_p_diff",x_lab="State capacity",y_lab="Rich-poor registration difference",xlim=c(-2.25,2.25),ylim=c(-5,75),leg_place=c(0.82,0.85)), width=4, height=4)

# State capacity over time (Figure A1, Figure A3)
################################################################################

dat <- fread("Data/Cross-national/capacity.csv")

dat$label <- "RoW"
dat$label[dat$WBregion==8] <- "SSA"
dat$label[dat$country=="Tanzania"] <- "Tanzania"
dat$label <- factor(dat$label, levels=unique(dat$label))

dat$alph <- 0.15
dat$alph[dat$label=='SSA'] <- 0.3
dat$alph[dat$label=="Tanzania"] <- 1

dat$country <- factor(dat$country)
dat$country <- fct_rev(relevel(dat$country, ref="Tanzania"))

p <- ggplot(data=dat, aes(x=year, y=z_capacity, group=country, colour=label))+
  geom_hline(aes(yintercept=0),lty=2,alpha=0.7)+
  geom_path(alpha=dat$alph)+
  scale_colour_manual(name=NULL,values=c("grey70","#29AF7FFF","#481567FF"))+
  theme_bw()+xlab(NULL)+ylab("State capacity")+
  scale_x_continuous(breaks=seq(1960,2010,10))+
  theme(legend.position=c(0.8,0.8), legend.key=element_rect(colour='black', fill=NULL, size=0.5))

ggsave("Figures/Figure A3.pdf", plot=p, width=5, height=4)

# Figure A1: Exclusion from public resources based on economic versus social status
################################################################################

sum <- dat %>% group_by(country, label) %>% 
  summarise(across(starts_with('v2xpe_'), function(x) mean(na.omit(x)))) %>% 
  ungroup() %>% mutate(across(starts_with('v2xpe_'), scale))

p <- ggplot(data=sum, aes(x=v2xpe_exlecon, y=v2xpe_exlsocgr))+
  geom_point(aes(group=label, colour=label, shape=label),alpha=0.8)+
  geom_smooth(colour='grey30', alpha=0)+
  theme_bw()+
  scale_colour_manual(name=NULL, values = c("grey70","#29AF7FFF","#481567FF"))+ 
  scale_shape_manual(name=NULL,values=c(19,19,17))+
  xlab("Socio-economic group exclusion")+ylab("Social group exclusion")+
  theme(legend.position=c(0.8,0.2), legend.key=element_rect(colour='black', fill=NULL, size=0.5))

ggsave("Figures/Figure A1.pdf", plot=p, width=4, height=4)

# Figure A4: Expansion of civil registries across SSA
################################################################################

dat <- fread("Data/Cross-national/civil_registries.csv")
dat$share_with_cr <- dat$cumulative_sum / dat$n_countries

p <- ggplot(data=dat, aes(x=year, y=share_with_cr))+theme_bw()+
  geom_step(colour="#29AF7FFF")+scale_x_continuous(breaks=seq(1900,2015,20))+ylim(c(0,1))+
  ylab("Share of SSA countries\nwith civil registries") + xlab(NULL)

ggsave("Figures/Figure A4.pdf", plot=p, width=4.5, height=3)


################################################################################
# Figures using Census data (Figures A6, A7, A8, A9) ###########################
################################################################################

d <- read_dta('Data/Analysis/census_bw10.dta') # file produced by make_tables_census.do

### Figures A6, A8: Trends in different outcome variables
################################################################################

df <- d %>% filter(abs(bw)<=10) %>% 
  mutate(bin=cut(bw, breaks=seq(-10,10,2), right=F)) %>% 
  group_by(group, bw) %>% 
  summarise(across(c("b_cert","ed_primary","ed_secondary","ed_uni","s_nih","s_pens_priv","s_pens_pub"),mean))
df$label <- NA
df$label[df$group==1] <- 'Treated district'
df$label[df$group==0] <- 'Control district'
df$label <- factor(df$label, levels=c("Control district","Treated district"))

p <- ggplot(data=df, aes(x=bw, y=b_cert, group=label, colour=factor(label)))+theme_bw()+ylim(c(0,0.5))+
  geom_vline(aes(xintercept=0),colour='darkgrey',linetype=2)+geom_path()+
  xlab('Birth year relative to reform')+ylab('Share registered')+ 
  theme(legend.title=element_blank(), legend.position=c(.25, .8))+
  scale_colour_manual(values=c("#29AF7FFF","#481567FF"))+
  scale_x_continuous(breaks=seq(-10,10,2))

ggsave("Figures/Figure A6a.pdf",plot=p,width=3.5,height=3.5)

df_ed <- tidyr::pivot_longer(df,cols=ed_primary:ed_uni)

df_ed$name[df_ed$name=='ed_primary'] <- 'Primary'
df_ed$name[df_ed$name=='ed_secondary'] <- 'Secondary'
df_ed$name[df_ed$name=='ed_uni'] <- 'University'

df_ss <- tidyr::pivot_longer(df,cols=s_nih:s_pens_pub)

df_ss$name[df_ss$name=='s_nih'] <- 'HI'
df_ss$name[df_ss$name=='s_pens_priv'] <- 'Private'
df_ss$name[df_ss$name=='s_pens_pub'] <- 'State'

gen.plot.rf <- function(df_input, y_lim, leg_position, y_lab) {
  plot.rf <- ggplot(data=df_input, aes(x=bw, y=value, group=label, colour=factor(label)))+
    facet_grid(cols=vars(name))+
    theme_bw()+ylim(y_lim)+
    geom_vline(aes(xintercept=0),colour='darkgrey',linetype=2)+geom_path()+
    xlab('Birth year relative to reform')+ylab(y_lab)+
    theme(legend.title=element_blank(), legend.position=leg_position,
          strip.background =element_rect(fill="white"))+
    scale_colour_manual(values=c("#29AF7FFF","#481567FF"))
  return(plot.rf)
}

ggsave("Figures/Figure A8a.pdf",plot=gen.plot.rf(df_ed, c(0,1), c(.85, .83), 'Share educated'),width=5,height=3)
ggsave("Figures/Figure A8b.pdf",plot=gen.plot.rf(df_ss, c(0,0.2), c(.85, .83), 'Share with access'),width=5,height=3)

### Figure A7: Estimates of first stage while excluding districts
################################################################################

baseline_fs <- as.numeric(felm(b_cert ~ treat + pl_male + pl_male:dcode_1967_g | dcode_1967_g + yob | 0 | dcode_1967_g, data=d)$beta[1])
districts <- unique(d$district_1967[d$group==1])

gen.reg <- function(index){
  print(index)
  exclude <- districts[index]
  print(exclude)
  df <- d[!(d$b_district_txt %in% exclude),]
  out <- felm(b_cert ~ treat + pl_male + pl_male:dcode_1967_g | yob + dcode_1967_g | 0 | dcode_1967_g, data=df)
  return(data.frame(index=index, coef=out$beta[1], se=out$cse[1], exclude))
}

results <- bind_rows(lapply(1:length(districts), function(x) gen.reg(x)))
colnames(results) <- c('id','coef','se','district')
results <- results %>% arrange(coef)
results$district <- str_trim(gsub("Mjini","",results$district))
results$district <- factor(results$district, levels=unique(results$district))

p <- ggplot(data=results, aes(y=district))+ theme_bw()+
  geom_vline(aes(xintercept=baseline_fs), colour='darkgrey')+
  geom_vline(aes(xintercept=0), alpha=1, colour='indianred')+
  geom_errorbarh(aes(xmin=coef-(1.96*se),xmax=coef+(1.96*se)),height=0, colour='grey20')+
  geom_errorbarh(aes(xmin=coef-(1.645*se),xmax=coef+(1.645*se)),height=0.3, colour='grey20')+
  geom_point(aes(x=coef),colour='grey30')+
  xlab(bquote("Effect of reform on registration"~(beta^FS)))+
  ylab("Excluded treated district")+
  scale_x_continuous(breaks=seq(0.0,0.1,0.02), limits = c(0,0.1))

ggsave("Figures/Figure A7.pdf",plot=p,width=4,height=4.5)

### Figure A9: Effects on education access by grade
################################################################################

gen.reg <- function(dv){
  print(dv)
  fmla <- as.formula(paste(dv,  '~ pl_male + pl_male:dcode_1967_g | yob + dcode_1967_g | (b_cert~treat) | dcode_1967_g', sep=' '))
  out <- felm(fmla, data=d)
  out <- data.frame(dv=dv, coef=out$coefficients[length(out$coefficients)], se=out$cse[length(out$cse)])
  return(out)
}

ed_vars <- colnames(d[startsWith(colnames(d),"ed_attain")])

ed_reg <- bind_rows(lapply(ed_vars, gen.reg))
ed_reg$index <- 1:length(ed_vars)

ed_reg$label <- c('P1','P2','P3','P4','P5','P6','P7','S1','S2','S3','S4','S5','S6','Uni')

p <- ggplot(data=ed_reg, aes(x=label))+
  theme_bw()+xlab('Education level')+ylab('Coefficient')+
  geom_hline(aes(yintercept=0), alpha=1, colour='indianred')+
  geom_errorbar(aes(ymin=coef-(1.96*se),ymax=coef+(1.96*se)),width=0, colour='grey30')+
  geom_errorbar(aes(ymin=coef-(1.645*se),ymax=coef+(1.645*se)),width=0.2, colour='grey30')+
  geom_point(aes(y=coef),colour='grey30')

ggsave("Figures/Figure A9.pdf",plot=p,width=4,height=2.5)

################################################################################
# Analysis using NPS data (Table 5, Figures A6, A8) ############################
################################################################################

d <- read_dta('Data/Analysis/nps_analysis.dta') # file produced by make_tables_nps.do

### Table 5: Complier characteristics
################################################################################

gen.kappa <- function(varname){
  df <- d %>% filter(abs(bw)<=10 & !is.na(group))
  fs <- felm(b_cert ~ treat | dcode_1967_g + yob + round | 0 | dcode_1967_g, data=df)
  df$var <- as.numeric(as.matrix(select(df, any_of(varname)))) 
  var.mean <- mean(na.omit(df$var))
  df <- df%>%mutate(t=case_when(var!=0~treat,var==0~0))
  out <- felm(b_cert ~ t | yob + dcode_1967_g + round | 0 | dcode_1967_g, data=df)
  c.ratio <- out$coefficients[1]/fs$coefficients[1]
  c.mean <- c.ratio*var.mean
  return(data.frame(varname, var.mean, c.mean, c.ratio))
}

c <- bind_rows(lapply(c("pl_male","ed_parent_pri","ed_parent_sec","ed_parent_uni"), function(x) gen.kappa(x)))

c$Variable <- c('Male','Parent has primary education','Parent has secondary education',
                'Parent has university education')
c <- dplyr::select(c, Variable, 
                   `\\specialcell{Sample \\\\ mean \\\\ (1)}`=var.mean, 
                   `\\specialcell{Complier \\\\ mean \\\\ (2)}`=c.mean, 
                   `\\specialcell{\\\\ Ratio \\\\ (3)}`=c.ratio)

tab <- datasummary_df(c,escape=F,align="lccc",fmt=2,output="latex",title="Complier characteristics\\label{tab:compliers}") %>% 
  add_footnote("\\StarnoteCompliers", threeparttable = T, escape=F, notation="none")

save_kable(tab, file = "Tables/Table 5.tex", escape=F, format="latex")


### Complier characteristics (Supplementary results)
################################################################################

gen.kappa <- function(varname, input, treat, ctrl, fe, cluster){
  df <- input
  df$t <- as.numeric(as.matrix(select(df, any_of(treat)))) 
  df$var <- as.numeric(as.matrix(select(df, any_of(varname)))) 
  fmla <- as.formula(paste("b_cert~t",ctrl,"|",fe,"| 0 |",cluster))
  fs <- felm(fmla,data=df)
  var.mean <- mean(na.omit(df$var))
  df <- df%>%mutate(t=case_when(var!=0~t,var==0~0))
  out <- felm(fmla, data=df)
  c.ratio <- out$coefficients[1]/fs$coefficients[1]
  c.mean <- c.ratio*var.mean
  return(data.frame(varname, var.mean, c.mean, c.ratio))
}

vars <- c("pl_male","ed_parent_pri","ed_parent_sec","ed_parent_uni")

distvars <- paste(colnames(d%>%select(starts_with(c("z_jm","z_lee","z_sch")))),collapse="+")
indvars <- paste(colnames(d%>%select(starts_with("pl_"))),collapse="+")

baseline <- bind_rows(lapply(vars, function(x) gen.kappa(x,d%>%filter(abs(bw)<=10),"treat","","yob + dcode_1967_g + round","dcode_1967_g")))

A1 <- bind_rows(lapply(vars, function(x) gen.kappa(x,d%>%filter(abs(bw)<=5),"treat","","yob + dcode_1967_g + round","dcode_1967_g")))
A2 <- bind_rows(lapply(vars, function(x) gen.kappa(x,d,"treat","","yob + dcode_1967_g + round","dcode_1967_g")))

B1 <- bind_rows(lapply(vars, function(x) gen.kappa(x,d%>%filter(abs(bw)<=10&bw!=0),"treat","","yob + dcode_1967_g + round","dcode_1967_g") ))
B2 <- bind_rows(lapply(vars, function(x) gen.kappa(x,d%>%filter(abs(bw)<=10&age!=40&age!=45&age!=50&age!=55),"treat","","yob + dcode_1967_g + round","dcode_1967_g") ))

C1 <- bind_rows(lapply(vars, function(x) gen.kappa(x,d%>%filter(abs(bw)<=10&bw!=0),"treat",paste("+post:(",distvars,")"),"yob + dcode_1967_g + round","dcode_1967_g") ))
C2 <- bind_rows(lapply(vars, function(x) gen.kappa(x,d%>%filter(abs(bw)<=10&bw!=0),"treat",paste("+(",indvars,")"),"yob + dcode_1967_g + round","dcode_1967_g") ))

D1 <- bind_rows(lapply(vars, function(x) gen.kappa(x,d%>%filter(abs(bw)<=10),"treat_urban","","yob + dcode_1967_g + round","dcode_1967_g") ))
D2 <- bind_rows(lapply(vars, function(x) gen.kappa(x,d%>%filter(abs(bw)<=10),"treat_all","","yob + dcode_1967_g + round","dcode_1967_g") ))

E1 <- bind_rows(lapply(vars, function(x) gen.kappa(x,d%>%filter(abs(bw)<=10),"treat","","yob + dcode_2012 + round","dcode_2012") ))
E2 <- bind_rows(lapply(vars, function(x) gen.kappa(x,d%>%filter(abs(bw)<=10),"treat","","yob + dcode_1967_g + round","dcode_1967_y") ))

gen.table <- function(input, filename, edit_title){
  input$Variable <- c('Male','Parent has primary education','Parent has secondary education',
                  'Parent has university education')
  input <- dplyr::select(input, Variable, 
                     `\\specialcell{Sample \\\\ mean \\\\ (1)}`=var.mean, 
                     `\\specialcell{Complier \\\\ mean \\\\ (2)}`=c.mean, 
                     `\\specialcell{\\\\ Ratio \\\\ (3)}`=c.ratio)
  tab <- datasummary_df(input,escape=F,align="lccc",fmt=2,output="latex",title=edit_title) %>% 
    add_footnote("\\StarnoteCompliers", threeparttable = T, escape=F, notation="none")
  save_kable(tab, file = paste('Tables/Additional results/Table 5_',filename,'.tex',sep=''), escape=F, format="latex")
}

gen.table(A1,"A1","Complier characteristics (+/- 5 cohorts)") # +/- 5 cohorts
gen.table(A2,"A2","Complier characteristics (All cohorts)") # Unrestricted cohorts
gen.table(B1,"B1","Complier characteristics (-Reform year)") # -Reform year
gen.table(B2,"B2","Complier characteristics (-Heaped ages)") # -Heaped ages
gen.table(C1,"C1","Complier characteristics (District-level)") # District-level controls
gen.table(C2,"C2","Complier characteristics (Individual-level)") # Individual-level controls
gen.table(D1,"D1","Complier characteristics (Urban areas in 1967)") # Urban control areas
gen.table(D2,"D2","Complier characteristics (Unrestricted)") # Unrestricted control areas
gen.table(E1,"E1","Complier characteristics (2012 district FEs)") # 2012 district of birth FEs
gen.table(E2,"E2","Complier characteristics (District-cohort clustering)") # District-cohort clustering


### Figures A6, A8: Trends in different outcome variables
################################################################################

df <- d %>% filter(abs(bw)<=10 & !is.na(group)) %>% 
  mutate(bin=cut(bw, breaks=seq(-10,10,2), right=F)) %>% 
  group_by(group, bw) %>% 
  summarise(across(c("b_cert",starts_with("tax_")),mean)) 
df$label <- NA
df$label[df$group==1] <- 'Treated district'
df$label[df$group==0] <- 'Control district'
df$label <- factor(df$label, levels=c("Control district","Treated district"))

p <- ggplot(data=df, aes(x=bw, y=b_cert, group=label, colour=factor(label)))+theme_bw()+ylim(c(0,0.5))+
  geom_vline(aes(xintercept=0),colour='darkgrey',linetype=2)+geom_path()+
  xlab('Birth year relative to reform')+ylab('Share registered')+ 
  theme(legend.title=element_blank(), legend.position=c(.25, .8))+
  scale_colour_manual(values=c("#29AF7FFF","#481567FF"))+
  scale_x_continuous(breaks=seq(-10,10,2))

ggsave("Figures/Figure A6b.pdf",plot=p,width=3.5,height=3.5)

df_tax <- tidyr::pivot_longer(df,cols=starts_with("tax"))
df_tax$name[df_tax$name=='tax_all'] <- 'Any tax'
df_tax$name[df_tax$name=='tax_fee'] <- 'Fees'
df_tax$name[df_tax$name=='tax_local'] <- 'Local'
df_tax$name[df_tax$name=='tax_central'] <- 'Central'

p <- ggplot(data=df_tax, aes(x=bw, y=value, group=label, colour=factor(label)))+theme_bw()+ylim(c(0,1))+
  geom_vline(aes(xintercept=0),colour='darkgrey',linetype=2)+geom_path()+
  xlab('Birth year relative to reform')+ylab('Share paying tax')+ 
  facet_grid(cols=vars(name))+
  theme(legend.title=element_blank(), legend.position=c(.825, .825), strip.background =element_rect(fill="white"))+
  scale_colour_manual(values=c("#29AF7FFF","#481567FF"))+
  scale_x_continuous(breaks=seq(-10,10,5))

ggsave("Figures/Figure A8c.pdf",plot=p,width=5,height=3) 


################################################################################
# Figures using Tanzanian data (Figures A5, A10, A11) ##########################
################################################################################

# Figure A5: Map of treated and control districts
################################################################################

shp <- read_sf("Data/GIS/districts.shp") %>% filter(Region_Cod<50) # Exclude Zanzibar
town <- fread("Data/GIS/town_locations.csv")

shp$label[is.na(shp$g_parent)] <- "Excluded"
shp$label[shp$g_parent==1] <- "Treated"
shp$label[shp$g_parent==0] <- "Control"
shp$label <- factor(shp$label, levels=c("Treated","Control","Excluded"))

p <- ggplot()+theme_bw()+geom_sf(data=shp, aes(group=label, fill=label))+
  geom_point(data=town%>%filter(town==1), aes(x=long,y=lat), fill="#481567FF", shape=24, colour="grey90", size=3)+
  scale_fill_manual(name=NULL,values=c("#481567FF","#29AF7FFF","grey95"))+
  theme(legend.position=c(0.875,0.875))+xlab(NULL)+ylab(NULL)+
  theme(legend.title=element_blank(),line=element_blank(),panel.border=element_blank(),axis.text.x=element_blank(),axis.text.y=element_blank())

ggsave("Figures/Figure A5.pdf",plot=p,width=5,height=5)

# Figure A10: Expansion of compulsory registration orders in Tanzania
################################################################################

dat <- fread("Data/Descriptive/registration_orders_by_year.csv")

p <- ggplot(data=dat, aes(x=year, y=cumulative_share))+theme_bw()+
  geom_step(color="#481567FF")+ylab("Share of districts")+xlab(NULL)+scale_x_continuous(breaks=seq(1950,2010,20))

ggsave("Figures/Figure A10.pdf", plot=p, width=4, height=2.75)

# Figure A11: Relevant legislation during reform period
################################################################################

dat <- fread("Data/Descriptive/legislation.csv")

p <- ggplot(data=dat, aes(x=year, y=value, group=name, color=name))+theme_bw()+
  scale_color_viridis(discrete=T, option="viridis")+theme(legend.title=element_blank(),legend.position=c(0.8,0.8))+
  geom_line()+ylab("Share of legislation mentioning...")+xlab(NULL)+ylim(c(0,0.3))

ggsave("Figures/Figure A11.pdf", plot=p, width=5, height=4)
